library(dplyr);
library(magrittr);
library(ggplot2);
library(lubridate);
library(readr);
# Load the 2017 marathon results and keep only the columns used below:
# age, gender, the eight 5 km checkpoint times and the official finish time.
df <- read.csv("./marathon_results_2017.csv", header = TRUE, stringsAsFactors = FALSE)
df <- df[c("Age", "M.F", "X5K", "X10K", "X15K", "X20K", "X25K", "X30K", "X35K", "X40K", "Official.Time")]

# Drop every runner whose value at any checkpoint is the placeholder "-".
df %<>% filter(X5K != "-" & X10K != "-" & X15K != "-" & X20K != "-" &
                 X25K != "-" & X30K != "-" & X35K != "-" & X40K != "-")
df

# Checkpoint columns, in race order.
cols <- c("X5K", "X10K", "X15K", "X20K", "X25K", "X30K", "X35K", "X40K")

# Parse the "H:M:S" checkpoint strings into POSIXct.
# This replaces the deprecated `mutate_each_()` / `funs()` call with a base R
# column-wise `lapply()`, which has no dependency on the dplyr version.
# The format carries no date, so `as.POSIXct()` fills in the current date in
# the local time zone; only differences between these values are meaningful.
df[cols] <- lapply(df[cols], as.POSIXct, format = "%H:%M:%S")
# Turn the cumulative checkpoint clock times into per-5K split durations in
# seconds. Walk the checkpoints from last to first so that each subtraction
# still sees the previous checkpoint as an (unconverted) clock time.
for (k in rev(seq_along(cols))[-length(cols)]) {
  df[[cols[k]]] <- as.numeric(
    difftime(df[[cols[k]]], df[[cols[k - 1]]], units = "secs")
  )
}

# The first split is measured from 00:00:00, parsed the same way as the
# checkpoint columns.
df[[cols[1]]] <- as.numeric(
  difftime(df[[cols[1]]], as.POSIXct("00:00:00", format = "%H:%M:%S"), units = "secs")
)

# Give the gender column a readable name.
names(df)[names(df) == "M.F"] <- "Gender"
df
# Count runners per gender / age bracket. Gender "M" is relabelled "MEN" and
# everything else "WOMEN"; ages above 40 become "OLD", the rest "YOUNG".
demo <- df %>%
  mutate(
    Gender = ifelse(Gender == "M", "MEN", "WOMEN"),
    Age = ifelse(Age > 40, "OLD", "YOUNG")
  ) %>%
  group_by(Gender, Age) %>%
  count()

# Combined "<age> <gender>" label, used for the pie-chart slices.
demo[["comb"]] <- paste(demo[["Age"]], demo[["Gender"]])
demo
# pie charts in gg plot are just too much work
#library(scales)
#ggplot(demo, aes(x="", y=n, fill=factor(comb)))+
# geom_bar(width=1, stat="identity") +
# scale_fill_manual(values=c("#3617ff", "#e048ce", "#45caff", "#ffc4fb")) +
# coord_polar("y", start=0)
# Pie chart of the gender / age groups, labelled with rounded percentages.
slices <- demo$n
pct <- round(100 * slices / sum(slices))
# Label format: "<age> <gender> <pct>%"
lbls <- paste0(demo$comb, " ", pct, "%")
pie(
  slices,
  labels = lbls,
  col = c("blue", "cyan", "violet", "pink"),
  main = "Distribution of gender and age"
)
# Finishing times by gender ----
# Parse the official finish time ("H:M:S") into POSIXct, the same way as the
# checkpoint columns.
df$Official.Time <- as.POSIXct(df$Official.Time, format = "%H:%M:%S")

# Density-normalised histogram of finish times, one overlaid layer per gender.
# Columns are referenced by bare name inside aes(): using `df$col` there
# bypasses ggplot2's data masking and leaks "df$..." into the axis and legend
# titles.
ggplot(df, aes(Official.Time, fill = Gender)) +
  geom_histogram(aes(y = ..density..), alpha = 0.6,
                 position = "identity", lwd = 0.2) +
  ggtitle("Normalized")

# Same histogram split by age bracket (above 40 = "OLD", otherwise "YOUNG").
df %>%
  mutate(Age = ifelse(Age > 40, "OLD", "YOUNG")) %>%
  ggplot(aes(Official.Time, fill = Age)) +
  geom_histogram(aes(y = ..density..), alpha = 0.6,
                 position = "identity", lwd = 0.2) +
  ggtitle("Normalized")
# Number of equal-width finish-time bins.
n_groups <- 20
# Alternating two-colour palette with exactly one entry per bin, derived from
# n_groups rather than a hard-coded repeat count.
zebra_colormap <- rep(c("darkcyan", "cyan"), length.out = n_groups)

# Keep only runners below the 99th percentile of finish time.
df <- df[df$Official.Time < quantile(df$Official.Time, 0.99), ]
#splits = split(df, cut(df$Official.Time, N)) # Time splits
#splits <- split(df, rep(1:ceiling(nrow(df)/N), each=N, length.out=nrow(df))) # N marathoners splits
#df$group <- rep(1:ceiling(nrow(df)/n_groups), each=nrow(df)/n_groups, length.out=nrow(df)) # N marathoners splits

# Convert the finish time to minutes since 00:00:00 and bin it into n_groups
# equal-width intervals.
df$Official.Time <- as.numeric(
  difftime(df$Official.Time, as.POSIXct("00:00:00", format = "%H:%M:%S"), units = "mins")
)
df$group <- cut(df$Official.Time, n_groups)

# Finish time against row position, coloured by bin. Columns are referenced by
# bare name inside aes() instead of through `df$`.
ggplot(df) +
  geom_point(aes(x = seq_along(Official.Time), y = Official.Time, col = group)) +
  scale_color_manual(values = zebra_colormap) +
  theme_bw()
# Are women more disciplined than men? ----
# Per finish-time group, measure how evenly runners pace themselves: for every
# runner take the standard deviation of their eight 5K split times, then
# summarise those per-runner SDs within each group for men ("M"), women ("F")
# and both together ("B").
women <- df %>%
  filter(Gender == "F")
men <- df %>%
  filter(Gender == "M")

# One data frame per finish-time group. `group` is a factor, so split() keeps
# empty groups as zero-row elements.
b_splits <- split(df, df$group)
w_splits <- split(women, women$group)
m_splits <- split(men, men$group)

# Summarise one group of runners as a single row.
#   runners:      data frame holding the split columns named in `cols`
#   group_id:     index of the finish-time group
#   gender_label: "M", "F" or "B"
# Returns a one-row data frame (group, gender, n, mean_sd, sd_sd), or NULL for
# an empty group, which has no mean or SD to report.
split_sd_summary <- function(runners, group_id, gender_label) {
  n_in_group <- nrow(runners)
  if (n_in_group == 0) {
    return(NULL)
  }
  runner_sd <- apply(as.matrix(runners[, cols, drop = FALSE]), 1, sd, na.rm = TRUE)
  data.frame(
    group = group_id,
    gender = gender_label,
    n = n_in_group,
    mean_sd = mean(runner_sd, na.rm = TRUE),
    sd_sd = sd(runner_sd, na.rm = TRUE),
    stringsAsFactors = FALSE
  )
}

# Collect rows in the order M, F, B for each group. Every row carries its own
# mean_sd and sd_sd, so nothing depends on a leftover `sd_sd` variable and no
# row receives more values than there are columns.
g_sd_rows <- lapply(seq_len(n_groups), function(i) {
  list(
    split_sd_summary(m_splits[[i]], i, "M"),
    split_sd_summary(w_splits[[i]], i, "F"),
    split_sd_summary(b_splits[[i]], i, "B")
  )
})
g_sd_rows <- Filter(Negate(is.null), unlist(g_sd_rows, recursive = FALSE))

# Start from an empty, correctly typed template so the result has the expected
# columns even when every group is empty.
g_sd_template <- data.frame(
  group = numeric(0),
  gender = character(0),
  n = numeric(0),
  mean_sd = numeric(0),
  sd_sd = numeric(0),
  stringsAsFactors = FALSE
)
g_sd_df <- do.call(rbind, c(list(g_sd_template), g_sd_rows))
rownames(g_sd_df) <- NULL
# Make sure the numeric summary columns really are numeric.
g_sd_df[c("n", "mean_sd", "sd_sd", "group")] <- lapply(
  g_sd_df[c("n", "mean_sd", "sd_sd", "group")],
  as.numeric
)
g_sd_df

# Mean per-runner split SD for each finish-time group, all three gender labels.
ggplot(g_sd_df, aes(x = group, y = mean_sd, colour = gender)) +
  geom_point() +
  scale_color_manual(values = c("black", "violet", "blue")) +
  theme_bw()

# Keep only the per-gender rows and attach an error term of
# 1.96 * sd_sd / sqrt(n) to each.
g_sd_df <- filter(g_sd_df, gender == "M" | gender == "F")
g_sd_df$error <- with(g_sd_df, (1.96 * sd_sd) / sqrt(n))
g_sd_df

# Men vs. women, with error bars of +/- error around each mean.
ggplot(g_sd_df, aes(x = group, y = mean_sd, colour = gender)) +
  geom_point() +
  geom_errorbar(aes(ymin = mean_sd - error, ymax = mean_sd + error)) +
  scale_color_manual(values = c("violet", "blue")) +
  theme_bw()
# Render an animated GIF of split-time profiles, adding one sampled runner per
# frame. Skipped entirely when 'animation.gif' already exists.
if (!file.exists("animation.gif")) {
  library(animation)

  # Draw n_samples runners (with replacement) from every finish-time group.
  n_samples <- 20
  sample <- df %>%
    group_by(group) %>%
    sample_n(n_samples, replace = TRUE)
  n_frames <- nrow(sample)

  # Rows = checkpoints, columns = sampled runners. Plain data-frame indexing
  # selects exactly the split columns, without the grouping column that
  # select() would add back on a grouped tibble.
  times <- t(data.matrix(as.data.frame(sample)[, cols]))

  # Draw one frame per sampled runner: all runners so far as thin coloured
  # lines (one colour per group), the newest runner highlighted in black.
  makeplot <- function() {
    for (i in seq_len(n_frames)) {
      plot.ts(times[cols, 1:i],
              plot.type = "single",
              lwd = 0.5,
              col = rep(rainbow(n_groups), each = n_samples),
              ylim = c(900, 3000),
              xlab = "", ylab = "", axes = FALSE)
      # lines() only adds to the existing plot; it takes no axis or label
      # arguments, so none are passed.
      lines(times[cols, i], lwd = 2, col = 1)
      title(main = "5K pace analysis",
            sub = paste("Group rank #", as.character(ceiling(i / n_samples))),
            xlab = "", ylab = "Split time (seconds)")
      axis(side = 2, at = c(800, 1000, 1500, 2000, 2500, 3000),
           labels = c("800", "1000", "1500", "2000", "2500", "3000"))
      axis(side = 1, at = c(-10, 1, 2, 3, 4, 5, 6, 7, 8),
           labels = c("", "5K", "10K", "15K", "20K", "25K", "30K", "35K", "40K"))
    }
  }

  # `n_runners` was never defined in this script; the number of sampled runners
  # (one frame each) is used as the frame limit instead.
  # NOTE(review): confirm this is the intended value for nmax.
  oopt <- ani.options(interval = 0, nmax = n_frames)
  # Restore the previous animation options even if rendering fails.
  tryCatch(
    saveGIF(makeplot(), interval = 0.1, width = 580, height = 400),
    finally = ani.options(oopt)
  )
}